# Computations
import pandas as pd
import numpy as np
# sklearn
from sklearn.impute import SimpleImputer
# Tools
import os
import datetime
import itertools
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## matplotlib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## seaborn
import seaborn as sns
# Global plot styling: seaborn grid theme first, then the matching matplotlib style sheet.
sns.set_style('whitegrid')
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and the
# old names were later removed, so pick whichever name this installation provides.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
In this article, a recommender system is developed that suggests similar movies based on the MovieLens datasets.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print ``Text`` as a coloured banner followed by a rule of '=' characters.

    Text : str, the banner caption.
    L    : int, total visible width; the rule gets ``L - len(Text) - 1`` characters.
    C    : background colour of the caption and foreground colour of the rule
           ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta' or 'Cyan').
    T    : foreground colour of the caption (any of the above, or 'White').

    Raises KeyError for a colour name that is not listed above.
    """
    back_names = ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan')
    fore_names = back_names + ('White',)
    back_codes = {name: getattr(Back, name.upper()) for name in back_names}
    fore_codes = {name: getattr(Fore, name.upper()) for name in fore_names}

    caption = back_codes[C] + fore_codes[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = fore_codes[C] + Style.NORMAL + '=' * (L - len(Text) - 1) + Style.RESET_ALL
    print(caption + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule made of ``L`` '=' characters in foreground colour ``C``.

    ``C`` must be one of 'Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta',
    'Cyan' or 'White'; any other name raises KeyError.
    """
    colour_names = ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan', 'White')
    palette = {name: getattr(Fore, name.upper()) for name in colour_names}
    rule = palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL
    print(rule)
def Search_List(Key, List):
    """Return the elements of ``List`` that contain ``Key``, keeping their order."""
    return list(filter(lambda item: Key in item, List))
# Folder that holds the raw MovieLens files (u.data, u.item, u.user)
PATH = 'movielens'

# --- Ratings (u.data, tab separated) -----------------------------------------------
# Column names come from the dataset readme; they are title-cased and "Id" is
# spelled "ID" so that every frame uses the same naming.
Cols = [name.title().replace('Id','ID') for name in ['user id','item id','rating','timestamp']]
data_df = pd.read_csv(os.path.join(PATH, 'u.data'), sep='\t', names=Cols)
data_df = data_df.rename(columns={'Item ID': 'Movie ID'})
Header('Full dataset')
display(data_df)

# --- Movie catalogue (u.item, pipe separated, Latin-1 encoded) -----------------------
# After the descriptive fields there is one 0/1 indicator column per genre.
# NOTE: str.title() turns "Children's" into "Children'S" in the resulting header.
Cols = ['movie id','movie title','release date','video release date','IMDb URL',
        'unknown','Action','Adventure','Animation',"Children's",'Comedy','Crime',
        'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
        'Romance','Sci-Fi','Thriller','War','Western']
Cols = [name.title().replace('Id','ID') for name in Cols]
item_df = pd.read_csv(os.path.join(PATH, 'u.item'), sep='|', names=Cols, encoding='iso-8859-1')
# These two columns are not used by the analysis
item_df = item_df.drop(columns=['Video Release Date', 'Imdb Url'])
Header('Information about the movies')
display(item_df)

# --- User demographics (u.user, pipe separated) ------------------------------------
Cols = [name.title().replace('Id','ID') for name in ['user id','age','gender','occupation','zip code']]
user_df = pd.read_csv(os.path.join(PATH, 'u.user'), sep='|', names=Cols)
Header('Demographic information about the users')
# Expand the one-letter gender codes and capitalise the occupation labels
user_df = user_df.replace({'M':'Male', 'F':'Female'})
user_df['Occupation'] = user_df['Occupation'].map(lambda job: job.title())
display(user_df)
Line()
del Cols
Full dataset =======================================================================================
| User ID | Movie ID | Rating | Timestamp | |
|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 |
| 1 | 186 | 302 | 3 | 891717742 |
| 2 | 22 | 377 | 1 | 878887116 |
| 3 | 244 | 51 | 2 | 880606923 |
| 4 | 166 | 346 | 1 | 886397596 |
| ... | ... | ... | ... | ... |
| 99995 | 880 | 476 | 3 | 880175444 |
| 99996 | 716 | 204 | 5 | 879795543 |
| 99997 | 276 | 1090 | 1 | 874795795 |
| 99998 | 13 | 225 | 2 | 882399156 |
| 99999 | 12 | 203 | 3 | 879959583 |
100000 rows × 4 columns
Information about the movies =======================================================================
| Movie ID | Movie Title | Release Date | Unknown | Action | Adventure | Animation | Children'S | Comedy | Crime | ... | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1677 | 1678 | Mat' i syn (1997) | 06-Feb-1998 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1678 | 1679 | B. Monkey (1998) | 06-Feb-1998 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 1679 | 1680 | Sliding Doors (1998) | 01-Jan-1998 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1680 | 1681 | You So Crazy (1994) | 01-Jan-1994 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1681 | 1682 | Scream of Stone (Schrei aus Stein) (1991) | 08-Mar-1996 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1682 rows × 22 columns
Demographic information about the users ============================================================
| User ID | Age | Gender | Occupation | Zip Code | |
|---|---|---|---|---|---|
| 0 | 1 | 24 | Male | Technician | 85711 |
| 1 | 2 | 53 | Female | Other | 94043 |
| 2 | 3 | 23 | Male | Writer | 32067 |
| 3 | 4 | 24 | Male | Technician | 43537 |
| 4 | 5 | 33 | Female | Other | 15213 |
| ... | ... | ... | ... | ... | ... |
| 938 | 939 | 26 | Female | Student | 33319 |
| 939 | 940 | 32 | Male | Administrator | 02215 |
| 940 | 941 | 20 | Male | Student | 97229 |
| 941 | 942 | 48 | Female | Librarian | 78209 |
| 942 | 943 | 22 | Male | Student | 77841 |
943 rows × 5 columns
====================================================================================================
def Data_info(Inp, Only_NaN = False):
    """Summarise the dtype and missing-value count of every column of ``Inp``.

    Inp      : DataFrame to inspect.
    Only_NaN : when True, keep only the columns that have at least one missing value.

    Returns a DataFrame indexed by column name with the columns 'Data Type',
    'Number of NaN Values' and 'Percentage' (missing values as a percentage of
    the number of rows, rounded to two decimals).
    """
    dtype_part = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_part = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = dtype_part.join(nan_part, how='outer')
    summary['Percentage'] = np.round(100*(summary['Number of NaN Values']/Inp.shape[0]),2)
    return summary.loc[summary['Number of NaN Values']>0] if Only_NaN else summary
# Show only the columns of item_df that contain at least one missing value
display(Data_info(item_df, Only_NaN = True))
| Data Type | Number of NaN Values | Percentage | |
|---|---|---|---|
| Release Date | object | 1 | 0.06 |
Imputing these missing values.
# Two imputers are created; only the most-frequent one is applied below because
# 'Release Date' holds date strings at this point.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# The imputer expects a 2-D array, hence the single-column reshape
Temp = item_df['Release Date'].values.reshape(-1,1)
Temp = imp.fit(Temp).transform(Temp)
item_df['Release Date'] = Temp
del Temp
Creating Movie info dataset (for analysis only)
# Parse the release-date strings (e.g. '01-Jan-1995') into datetimes
item_df['Release Date'] = pd.to_datetime(item_df['Release Date'])
# Release year, stored in the smallest integer dtype that fits
item_df['Release Year'] = pd.to_numeric(item_df['Release Date'].dt.year, downcast = 'integer')
# Strip the '(YYYY)' suffix from the titles. regex=True must be explicit: since
# pandas 2.0 str.replace treats the pattern as a literal string by default, which
# would silently leave every title unchanged.
item_df['Movie Title'] = item_df['Movie Title'].str.replace(r'\(\d+\)', '', regex=True).str.rstrip()

# Build one comma-separated 'Genre' string per movie from the 0/1 genre indicator
# columns. The indicator columns are selected by excluding the descriptive columns,
# so 'Release Year' is not mistaken for a genre flag, and the long-format filter
# does not depend on how DataFrame.stack treats missing values.
Info_Cols = ['Movie ID', 'Movie Title', 'Release Date', 'Release Year']
Genre_Cols = [col for col in item_df.columns if col not in Info_Cols]
Temp = item_df.melt(id_vars='Movie ID', value_vars=Genre_Cols, var_name='Genre', value_name='Flag')
Temp = Temp.loc[Temp['Flag'] == 1]
# Within each movie the genres keep the column order of item_df
Temp = Temp.groupby('Movie ID')['Genre'].apply(', '.join).reset_index()

# Movie info table (for analysis only): descriptive columns plus the genre string
Movie_info = item_df[Info_Cols].merge(Temp, how = 'outer', on = 'Movie ID')
del Temp, Info_Cols, Genre_Cols

# Persist the cleaned item table next to the raw data files
item_df.to_csv(os.path.join(PATH, 'item_df.csv'), index=False)
# Full dataset: attach each user's demographic attributes to every rating they gave
df = pd.merge(data_df, user_df, on = 'User ID', how = 'outer')
We can create age categories based on the age groups used by Statistics Canada (statcan.gc.ca).
def Age_Group(Data):
    """Add 'Age Group' (interval) and 'Age Category' (label) columns to ``Data``.

    Ages are binned into right-closed intervals:
    (0, 14] Children, (14, 24] Youth, (24, 64] Adults, (64, max age] Seniors.

    The Children bin is included only when the youngest age is 14 or less, and
    the Seniors bin only when the oldest age exceeds 64. This keeps every age
    inside a bin (an age of exactly 14 belongs to (0, 14]) and avoids building
    an inverted (64, max] interval when nobody is older than 64.

    Data : DataFrame with a numeric 'Age' column. It is modified in place and
           also returned, so both ``Age_Group(df)`` and ``df = Age_Group(df)`` work.

    Ages that fall outside every bin get a missing 'Age Group' and the string
    'nan' as 'Age Category'.
    """
    ages = Data['Age']
    youngest, oldest = ages.min(), ages.max()

    # (interval edges, label) pairs, in ascending order
    brackets = []
    if youngest <= 14:
        brackets.append(((0, 14), 'Children'))
    brackets.append(((14, 24), 'Youth'))
    brackets.append(((24, 64), 'Adults'))
    if oldest > 64:
        brackets.append(((64, oldest), 'Seniors'))

    bins = pd.IntervalIndex.from_tuples([edges for edges, _ in brackets])
    Data['Age Group'] = pd.cut(ages, bins)
    # Relabel the interval categories position by position instead of matching
    # their string form, which depends on how the interval edges are printed.
    labels = [label for _, label in brackets]
    Data['Age Category'] = Data['Age Group'].cat.rename_categories(labels).astype(str)
    return Data
# Add the 'Age Group' / 'Age Category' columns, then save the merged ratings/users
# table next to the raw files (same PATH folder the data was read from).
df = Age_Group(df)
df.to_csv(os.path.join(PATH, 'full_df.csv'), index=False)
Creating a dataset that we use here for our analysis
# Analysis table: every rating joined with the movie information, ordered by user
df = pd.merge(df, Movie_info, on = 'Movie ID', how = 'outer')
df = df.sort_values(by = 'User ID')
display(df)
| User ID | Movie ID | Rating | Timestamp | Age | Gender | Occupation | Zip Code | Age Group | Age Category | Movie Title | Release Date | Release Year | Genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30897 | 1 | 161 | 4 | 875072303 | 24 | Male | Technician | 85711 | (14, 24] | Youth | Top Gun | 1986-01-01 | 1986 | Action, Romance |
| 30527 | 1 | 144 | 4 | 875073180 | 24 | Male | Technician | 85711 | (14, 24] | Youth | Die Hard | 1988-01-01 | 1988 | Action, Thriller |
| 82356 | 1 | 41 | 2 | 876892818 | 24 | Male | Technician | 85711 | (14, 24] | Youth | Billy Madison | 1995-01-01 | 1995 | Comedy |
| 5247 | 1 | 13 | 5 | 875071805 | 24 | Male | Technician | 85711 | (14, 24] | Youth | Mighty Aphrodite | 1995-10-30 | 1995 | Comedy |
| 24073 | 1 | 208 | 5 | 878542960 | 24 | Male | Technician | 85711 | (14, 24] | Youth | Young Frankenstein | 1974-01-01 | 1974 | Comedy, Horror |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15325 | 943 | 568 | 3 | 888639042 | 22 | Male | Student | 77841 | (14, 24] | Youth | Speed | 1994-01-01 | 1994 | Action, Romance, Thriller |
| 42349 | 943 | 1047 | 2 | 875502146 | 22 | Male | Student | 77841 | (14, 24] | Youth | Multiplicity | 1996-07-12 | 1996 | Comedy |
| 43832 | 943 | 685 | 4 | 875502042 | 22 | Male | Student | 77841 | (14, 24] | Youth | Executive Decision | 1996-03-09 | 1996 | Action, Thriller |
| 93741 | 943 | 1330 | 3 | 888692465 | 22 | Male | Student | 77841 | (14, 24] | Youth | An Unforgettable Summer | 1994-01-01 | 1994 | Drama |
| 48091 | 943 | 401 | 1 | 888639867 | 22 | Male | Student | 77841 | (14, 24] | Youth | Brady Bunch Movie, The | 1995-01-01 | 1995 | Comedy |
100000 rows × 14 columns
# Number of ratings per (movie title, gender) pair and its share of all ratings
Temp = (df.groupby(by = ['Movie Title', 'Gender'])['Movie Title']
          .agg(['count'])
          .rename(columns = {'count': 'Count'})
          .reset_index())
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp.sort_values(by = ['Gender', 'Count'], inplace = True, ascending = False)

fig = make_subplots(rows = 1, cols = 1, vertical_spacing = 0.08)
# Plot the titles found in the first 20 rows of the sorted table. The index is
# shuffled by the sort, so the positional selection is made explicit with .iloc.
fig1 = px.bar(Temp.loc[Temp['Movie Title'].isin(Temp['Movie Title'].iloc[:20].unique().tolist())],
              x = 'Movie Title', y = 'Percentage', orientation = 'v', color = 'Gender', text = 'Percentage',
              hover_data = Temp.columns, color_discrete_sequence = ['Violet', 'BlueViolet'])
# Copy every bar trace (one per gender) into the subplot figure
for trace in fig1.data:
    fig.add_trace(trace, row = 1, col = 1)
# Update
fig.update_layout(height = 600)
fig.update_traces(marker_line_color = 'Black', marker_line_width = 1, opacity = 1, row = 1, col = 1)
fig.update_layout(plot_bgcolor = 'white', legend_orientation = 'v')
fig.update_yaxes(showline = True, linewidth = 1, linecolor = 'Lightgray', mirror = True,
                 showgrid = True, gridwidth = 1, gridcolor = 'Lightgray')
fig.update_traces(texttemplate = '%{text:.2}%', textposition = 'outside')
fig.update_layout(title = {'text': 'Most Watched Movie Titles by Gender',
                           'x': 0.50, 'y': 0.9, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
del Temp
# Per-title average rating and number of ratings.
# The aggregations are passed as an ordered list and renamed by name: a set such as
# {'count', 'mean'} has no guaranteed iteration order, so assigning the column
# labels by position could swap the two columns from one run to the next.
Ratings = (df.groupby('Movie Title')['Rating']
             .agg(['mean', 'count'])
             .rename(columns = {'mean': 'Average Rating', 'count': 'Number of Ratings'}))
# Most-rated titles first; 'Movie Title' becomes a regular column
Ratings = Ratings.sort_values(by = 'Number of Ratings', ascending = False).reset_index()
# Histogram (with a marginal box plot) of the per-title average rating
Feat = 'Average Rating'
# Upper limit of the frequency axis; also the number of points in the marker lines below
yLim = int(500)
fig = px.histogram(Ratings, x = Feat, nbins=10, marginal= 'box', color_discrete_sequence= ['LimeGreen'],
hover_data=[Feat])
fig.update_layout(plot_bgcolor= 'white', height = 500)
fig.update_layout(title={'text': 'Average Rating Distribution', 'x':0.5, 'y':0.92, 'xanchor': 'center', 'yanchor': 'top'},
yaxis_title='Frequency')
# Outline the traces created so far; this runs before the Scatter traces are added
fig.update_traces(marker_line_color= 'DarkGreen', marker_line_width=1.5, opacity=1)
# Vertical dotted line at the median: constant x, y running from 0 to yLim - 1
fig.add_trace(go.Scatter(x= Ratings[Feat].median()* np.ones(yLim), y= np.arange(0,yLim),
name="Median", line=dict(color='RoyalBlue', width=2, dash='dot')))
# Vertical dotted line at the mean, built the same way
fig.add_trace(go.Scatter(x= Ratings[Feat].mean()* np.ones(yLim), y= np.arange(0,yLim), name="Mean",
line=dict(color='Red', width=2, dash='dot')))
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', )
# Fix the range of the layout's 'yaxis' to [0, yLim]
fig['layout']['yaxis'].update(range=[0, yLim])
fig.show()
# Histogram (with a marginal box plot) of the number of ratings each title received
Feat = 'Number of Ratings'
# Upper limit of the frequency axis; also the number of points in the marker lines below
yLim = int(8e2)
fig = px.histogram(Ratings, x = Feat, nbins=50, marginal= 'box', color_discrete_sequence= ['LightSkyblue'],
                   hover_data=[Feat])
fig.update_layout(plot_bgcolor= 'white', height = 500)
# The title names the feature that is plotted here (it previously repeated the
# title of the average-rating histogram).
fig.update_layout(title={'text': 'Number of Ratings Distribution', 'x':0.5, 'y':0.92,
                         'xanchor': 'center', 'yanchor': 'top'},
                  yaxis_title='Frequency')
# Outline the traces created so far; this must stay before the Scatter traces are added
fig.update_traces(marker_line_color= 'Navy', marker_line_width=1.5, opacity=1)
# Vertical dotted line at the median: constant x, y running from 0 to yLim - 1
fig.add_trace(go.Scatter(x= Ratings[Feat].median()* np.ones(yLim), y= np.arange(0,yLim),
                         name="Median", line=dict(color='RoyalBlue', width=2, dash='dot')))
# Vertical dotted line at the mean, built the same way
fig.add_trace(go.Scatter(x= Ratings[Feat].mean()* np.ones(yLim), y= np.arange(0,yLim), name="Mean",
                         line=dict(color='Red', width=2, dash='dot')))
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray')
# Fix the range of the layout's 'yaxis' to [0, yLim]
fig['layout']['yaxis'].update(range=[0, yLim])
fig.show()
# User x movie rating matrix: one row per user, one column per title.
# Cells are NaN where the user did not rate the title.
Movie_Matrix = pd.pivot_table(df, values='Rating', index='User ID', columns='Movie Title')
Movie_Matrix.head(5)
| Movie Title | 'Til There Was You | 1-900 | 101 Dalmatians | 12 Angry Men | 187 | 2 Days in the Valley | 20,000 Leagues Under the Sea | 2001: A Space Odyssey | 3 Ninjas: High Noon At Mega Mountain | 39 Steps, The | ... | Yankee Zulu | Year of the Horse | You So Crazy | Young Frankenstein | Young Guns | Young Guns II | Young Poisoner's Handbook, The | Zeus and Roxanne | unknown | Á köldum klaka (Cold Fever) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| User ID | |||||||||||||||||||||
| 1 | NaN | NaN | 2.0 | 5.0 | NaN | NaN | 3.0 | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 5.0 | 3.0 | NaN | NaN | NaN | 4.0 | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | NaN | 4.0 | NaN |
5 rows × 1659 columns
Since users might have not watched all movies (only some movies), some of these values appear as NaN.
Let's check the ten movies with an average rating greater than 4.0 that have received the largest number of ratings.
# Titles whose average rating exceeds 4.0, most-rated first
Top_Ratings = Ratings[Ratings['Average Rating'] > 4.0].sort_values('Number of Ratings',ascending=False)
Top_list = list(Top_Ratings['Movie Title'])
Top = 10
print(Back.CYAN + "The Top %i by Average Rating:" % Top)
print(Style.RESET_ALL)
# Show the table without its index. Styler.hide_index() was removed in pandas 2.0;
# Styler.hide(axis='index') replaces it, with a fallback for older installations.
Top_Styler = Top_Ratings.head(Top).style
display(Top_Styler.hide(axis='index') if hasattr(Top_Styler, 'hide') else Top_Styler.hide_index())
del Top_Styler
The Top 10 by Average Rating:
| Movie Title | Average Rating | Number of Ratings |
|---|---|---|
| Star Wars | 4.358491 | 583 |
| Fargo | 4.155512 | 508 |
| Return of the Jedi | 4.007890 | 507 |
| Raiders of the Lost Ark | 4.252381 | 420 |
| Godfather, The | 4.283293 | 413 |
| Pulp Fiction | 4.060914 | 394 |
| Silence of the Lambs, The | 4.289744 | 390 |
| Empire Strikes Back, The | 4.204360 | 367 |
| Titanic | 4.245714 | 350 |
| Fugitive, The | 4.044643 | 336 |
In doing so, consider a movie title, for example,
# Use the first (most-rated) title of the top list as the reference movie
Movie = Top_list[0]
# Reset the style after the highlighted title so the yellow background does not
# carry over into later output.
print(Back.YELLOW + '"%s":' % Movie + Style.RESET_ALL)
"Star Wars":
Now let's consider the user ratings for this movie
# Column of user ratings for the selected movie (NaN where a user gave no rating)
Temp = Movie_Matrix[Movie]
We can then use corrwith() method to get correlations between two pandas series:
# Correlate every movie's rating column with the ratings of the selected movie
similar_to_movie = Movie_Matrix.corrwith(other=Temp)
display(similar_to_movie.to_frame(name='Pairwise Correlation'))
| Pairwise Correlation | |
|---|---|
| Movie Title | |
| 'Til There Was You | 0.872872 |
| 1-900 | -0.645497 |
| 101 Dalmatians | 0.211132 |
| 12 Angry Men | 0.184289 |
| 187 | 0.027398 |
| ... | ... |
| Young Guns II | 0.228615 |
| Young Poisoner's Handbook, The | -0.007374 |
| Zeus and Roxanne | 0.818182 |
| unknown | 0.723123 |
| Á köldum klaka (Cold Fever) | NaN |
1659 rows × 1 columns
Cleaning the data by removing NaN values and storing the correlations in a DataFrame instead of a Series.
# Correlations as a two-column frame ('Movie Title', 'Correlation'); titles whose
# correlation could not be computed are dropped.
df_movie_corr = similar_to_movie.to_frame('Correlation').reset_index(drop = False)
df_movie_corr.dropna(inplace=True)
# Name the movie the correlations were actually computed for
print('Ten similar movies to "%s":' % Movie)
# Ten highest correlations, shown without the index. Styler.hide_index() was removed
# in pandas 2.0; Styler.hide(axis='index') replaces it, with a fallback for older
# installations.
Corr_Styler = df_movie_corr.sort_values('Correlation',ascending=False).head(10).style
Corr_Styler.hide(axis='index') if hasattr(Corr_Styler, 'hide') else Corr_Styler.hide_index()
Ten similar movies to "Star Wars":
| Movie Title | Correlation |
|---|---|
| Man of the Year | 1.000000 |
| Commandments | 1.000000 |
| Cosi | 1.000000 |
| Stripes | 1.000000 |
| Star Wars | 1.000000 |
| Hollow Reed | 1.000000 |
| No Escape | 1.000000 |
| Outlaw, The | 1.000000 |
| Ed's Next Move | 1.000000 |
| Full Speed | 1.000000 |
However, some of these movies were rated by only a few users, so their correlations are not reliable. Suppose that we are only interested in movies that have at least 100 ratings. We have,
# Number of ratings per title, used below to filter out rarely rated movies
Ratings[['Movie Title','Number of Ratings']]
| Movie Title | Number of Ratings | |
|---|---|---|
| 0 | Star Wars | 583 |
| 1 | Contact | 509 |
| 2 | Fargo | 508 |
| 3 | Return of the Jedi | 507 |
| 4 | Liar Liar | 485 |
| ... | ... | ... |
| 1654 | Object of My Affection, The | 1 |
| 1655 | Office Killer | 1 |
| 1656 | Other Voices, Other Rooms | 1 |
| 1657 | Paris Was a Woman | 1 |
| 1658 | Á köldum klaka (Cold Fever) | 1 |
1659 rows × 2 columns
# Attach the number of ratings of every title to its correlation value
df_movie_corr = df_movie_corr.merge(Ratings.loc[:, ['Movie Title','Number of Ratings']], on = 'Movie Title')
df_movie_corr.head(5)
| Movie Title | Correlation | Number of Ratings | |
|---|---|---|---|
| 0 | 'Til There Was You | 0.872872 | 9 |
| 1 | 1-900 | -0.645497 | 5 |
| 2 | 101 Dalmatians | 0.211132 | 109 |
| 3 | 12 Angry Men | 0.184289 | 125 |
| 4 | 187 | 0.027398 | 41 |
Now sort the values and notice how the titles make a lot more sense:
# Keep titles with more than 100 ratings and list the five highest correlations
df_movie_corr.loc[df_movie_corr['Number of Ratings'] > 100].sort_values(by='Correlation', ascending=False).head(5)
| Movie Title | Correlation | Number of Ratings | |
|---|---|---|---|
| 1188 | Star Wars | 1.000000 | 583 |
| 396 | Empire Strikes Back, The | 0.747981 | 367 |
| 1050 | Return of the Jedi | 0.672556 | 507 |
| 1025 | Raiders of the Lost Ark | 0.536117 | 420 |
| 91 | Austin Powers: International Man of Mystery | 0.377433 | 130 |
We can come up with a similar analysis for any other movie from the list. Therefore, we summarize the procedure in the next section.
Based on the analysis, we can create the following function that recommends movies similar to a movie that we just watched.
The following function recommends N movies similar to the Inp movie.
def Movie_Recommendations(Inp, N= 10, at_least= 100, Data = Movie_Matrix, Rate_df= Ratings):
    """Return the ``N`` titles whose user ratings correlate best with those of ``Inp``.

    Inp      : title of the reference movie; must be a column of ``Data``
               (a KeyError is raised otherwise).
    N        : number of recommendations to return.
    at_least : minimum number of ratings a title needs to be considered (inclusive).
    Data     : user x movie rating matrix, one column per title.
    Rate_df  : frame with the columns 'Movie Title' and 'Number of Ratings'.

    ``Data`` and ``Rate_df`` default to the objects that ``Movie_Matrix`` and
    ``Ratings`` referred to when this function was defined.

    Returns a list of up to ``N`` titles ordered by decreasing correlation. The
    reference movie is never part of the list, and titles whose correlation could
    not be computed are skipped. A coloured header line is printed as a side effect.
    """
    reference = Data[Inp]
    Movie_corr = Data.corrwith(reference).to_frame('Correlation')
    rating_counts = Rate_df.set_index('Movie Title')['Number of Ratings']
    Movie_corr = Movie_corr.join(rating_counts)

    # Inclusive threshold, as the parameter name promises
    candidates = Movie_corr[Movie_corr['Number of Ratings'] >= at_least]
    # Remove the reference movie by label rather than assuming it is ranked first:
    # another title can tie at correlation 1.0, and the reference movie itself may
    # already have been removed by the rating-count filter.
    candidates = candidates.drop(index=Inp, errors='ignore').dropna(subset=['Correlation'])
    Move_Rec = candidates.sort_values('Correlation', ascending=False).head(N)

    # Report the number of movies requested and reset the colour afterwards
    print(Back.CYAN + '%i similar movies to: "%s":' % (N, Inp) + Style.RESET_ALL)
    return list(Move_Rec.index)
For example, consider Star Wars. For this movie, the function recommends the following movies
# Example: recommendations for 'Star Wars' with the default settings
Movie_Recommendations('Star Wars')
Ten similar movies to: "Star Wars":
['Empire Strikes Back, The', 'Return of the Jedi', 'Raiders of the Lost Ark', 'Austin Powers: International Man of Mystery', 'Sting, The', 'Indiana Jones and the Last Crusade', 'Pinocchio', 'Frighteners, The', 'L.A. Confidential']